Importing Required Libraries¶
# Data Manipulation
import numpy as np
import pandas as pd
# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly
plotly.offline.init_notebook_mode()
from plotly.subplots import make_subplots
# Machine Learning
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Imbalanced Data Handling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
# Others
import warnings
warnings.filterwarnings('ignore')
Loading the Dataset¶
data = pd.read_csv("./stroke.csv")
data.drop(columns=['id'],inplace=True)
data
| gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Male | 67.0 | 0 | 1 | Yes | Private | Urban | 228.69 | 36.6 | formerly smoked | 1 |
| 1 | Female | 61.0 | 0 | 0 | Yes | Self-employed | Rural | 202.21 | NaN | never smoked | 1 |
| 2 | Male | 80.0 | 0 | 1 | Yes | Private | Rural | 105.92 | 32.5 | never smoked | 1 |
| 3 | Female | 49.0 | 0 | 0 | Yes | Private | Urban | 171.23 | 34.4 | smokes | 1 |
| 4 | Female | 79.0 | 1 | 0 | Yes | Self-employed | Rural | 174.12 | 24.0 | never smoked | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5105 | Female | 80.0 | 1 | 0 | Yes | Private | Urban | 83.75 | NaN | never smoked | 0 |
| 5106 | Female | 81.0 | 0 | 0 | Yes | Self-employed | Urban | 125.20 | 40.0 | never smoked | 0 |
| 5107 | Female | 35.0 | 0 | 0 | Yes | Self-employed | Rural | 82.99 | 30.6 | never smoked | 0 |
| 5108 | Male | 51.0 | 0 | 0 | Yes | Private | Rural | 166.29 | 25.6 | formerly smoked | 0 |
| 5109 | Female | 44.0 | 0 | 0 | Yes | Govt_job | Urban | 85.28 | 26.2 | Unknown | 0 |
5110 rows × 11 columns
Exploratory Data Analysis¶
Checking for Duplicates¶
data.duplicated().any()
False
Exploring the Features and Labels¶
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5110 entries, 0 to 5109 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 5110 non-null object 1 age 5110 non-null float64 2 hypertension 5110 non-null int64 3 heart_disease 5110 non-null int64 4 ever_married 5110 non-null object 5 work_type 5110 non-null object 6 Residence_type 5110 non-null object 7 avg_glucose_level 5110 non-null float64 8 bmi 4909 non-null float64 9 smoking_status 5110 non-null object 10 stroke 5110 non-null int64 dtypes: float64(3), int64(3), object(5) memory usage: 439.3+ KB
Checking Null Values¶
data.isnull().sum()
gender 0 age 0 hypertension 0 heart_disease 0 ever_married 0 work_type 0 Residence_type 0 avg_glucose_level 0 bmi 201 smoking_status 0 stroke 0 dtype: int64
Visualizations¶
Missing Values¶
fig = px.imshow(data.isna().transpose(),color_continuous_scale="Blues")
fig.show()
Pie Charts¶
fig = make_subplots(
rows=4, cols=2, subplot_titles=("gender", "hypertension",
"heart_disease","ever_married",
"work_type", "Residence_type",
'smoking_status', 'stroke'),
specs=[[{"type": "domain"}, {"type": "domain"}],
[{"type": "domain"}, {"type": "domain"}],
[{"type": "domain"}, {"type": "domain"}],
[{"type": "domain"}, {"type": "domain"}]],
)
colours = ['#4285f4', '#ea4335', '#fbbc05', '#34a853']
fig.add_trace(go.Pie(labels=np.array(data['gender'].value_counts().index),
values=[x for x in data['gender'].value_counts()],
textinfo='label+percent', rotation=-45, hole=.35,
marker_colors=colours),
row=1, col=1)
fig.add_trace(go.Pie(labels=np.array(data['hypertension'].value_counts().index),
values=[x for x in data['hypertension'].value_counts()],
textinfo='label+percent', hole=.35,
marker_colors=colours),
row=1, col=2)
fig.add_trace(go.Pie(labels=np.array(data['heart_disease'].value_counts().index),
values=[x for x in data['heart_disease'].value_counts()],
textinfo='label+percent', rotation=-45, hole=.35,
marker_colors=colours),
row=2, col=1)
fig.add_trace(go.Pie(labels=np.array(data['ever_married'].value_counts().index),
values=[x for x in data['ever_married'].value_counts()],
textinfo='label+percent', rotation=-45, hole=.35,
marker_colors=colours),
row=2, col=2)
fig.add_trace(go.Pie(labels=np.array(data['work_type'].value_counts().index),
values=[x for x in data['work_type'].value_counts()],
textinfo='label+percent', hole=.35,
marker_colors=colours),
row=3, col=1)
fig.add_trace(go.Pie(labels=np.array(data['Residence_type'].value_counts().index),
values=[x for x in data['Residence_type'].value_counts()],
textinfo='label+percent', hole=.35,
marker_colors=colours),
row=3, col=2)
fig.add_trace(go.Pie(labels=np.array(data['smoking_status'].value_counts().index),
values=[x for x in data['smoking_status'].value_counts()],
textinfo='label+percent', rotation=-45, hole=.35,
marker_colors=colours),
row=4, col=1)
fig.add_trace(go.Pie(labels=np.array(data['stroke'].value_counts().index),
values=[x for x in data['stroke'].value_counts()],
rotation=-45, textinfo='label+percent', hole=.35,
marker_colors=colours),
row=4, col=2)
fig.update_layout(height=2000, font=dict(size=14), showlegend=False)
fig.show()
Histograms Relations¶
fig = px.histogram(data, x="ever_married", color='stroke', color_discrete_map = {'Yes':'#ea4335','No':'#4285f4'})
fig.show()
Insights:
- Most people who had a stroke are married
Stroke & Heart Disease¶
fig = px.histogram(data, x="heart_disease", color='stroke', color_discrete_map = {'Yes':'#ea4335','No':'#4285f4'})
fig.show()
Insights:
- The number of people who had heart disease is much lower than the number of people who did not.
- The proportion of people who had a stroke in the heart_disease category is much higher than the proportion of people who had a stroke in the No heart_disease category.
- But the number of people who had heart disease is significantly lower than the number of people who didn't, so it cannot be confidently concluded that people with heart disease is more likely to suffer from a stroke than people with no heart disease.
Stroke & Hypertension¶
fig = px.histogram(data, x="hypertension", color='stroke', color_discrete_map = {'Yes':'#ea4335','No':'#4285f4'})
fig.show()
Insights:
The number of people who has hypertension is much lower than the number of people who do not.
The proportion of people who had a stroke in the hypertension category is much higher than the proportion of people who had a stroke in the No hypertension category.
But the number of people who had hypertension is significantly lower than the number of people who didn't, so it cannot be confidently concluded that people with hypertension is more likely to suffer from a stroke than people with no hypertension.
Replacing BMI Null Values with it's mean¶
data.bmi.fillna(data.bmi.mean(),inplace=True)
Exploring Numerical Features¶
numerical = data.select_dtypes(include=['int64','float64']).columns.tolist()
numerical
['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke']
data[numerical[:-1]] = StandardScaler().fit_transform(data[numerical[:-1]])
Exploring Categorical Features¶
categories = data.select_dtypes(include=['object']).columns.tolist()
categories
['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
Encoding Gender¶
data.gender.value_counts()
gender Female 2994 Male 2115 Other 1 Name: count, dtype: int64
data.gender = data.gender.map({'Male' : 1,'Female' : 0, 'Other': 2})
Encoding Marital Status¶
data.ever_married.value_counts()
ever_married Yes 3353 No 1757 Name: count, dtype: int64
data.ever_married = data.ever_married.map({'Yes':1,'No':0})
Encoding Type of Work¶
data.work_type.value_counts()
work_type Private 2924 Self-employed 819 children 687 Govt_job 657 Never_worked 22 Name: count, dtype: int64
data.work_type = data.work_type.map({'Private':0,'Self-employed':1,'children':2,'Govt_job':3,'Never_worked':4})
Encoding Residence¶
data.Residence_type.value_counts()
Residence_type Urban 2596 Rural 2513 Name: count, dtype: int64
data.Residence_type = data.Residence_type.map({'Urban':1,'Rural':0})
Encoding Smoking¶
data.smoking_status.value_counts()
smoking_status never smoked 1892 Unknown 1544 formerly smoked 884 smokes 789 Name: count, dtype: int64
data.smoking_status = data.smoking_status.map({'Unknown':0,'smokes':1,'formerly smoked':2,'never smoked':3})
Implementing Scalers¶
minmax_scaler = preprocessing.MinMaxScaler()
robust_scaler = preprocessing.RobustScaler()
standard_scaler = preprocessing.StandardScaler()
Splitting the Features and Label from the Dataset¶
X = data.drop('stroke',axis=1)
y = data["stroke"]
X.shape,y.shape
((5109, 10), (5109,))
y.value_counts()
stroke 0 4860 1 249 Name: count, dtype: int64
Sampling¶
over_samp = SMOTE(sampling_strategy=0.1)
under_samp = RandomUnderSampler(sampling_strategy=0.5)
steps = [('over', over_samp), ('under', under_samp)]
pipeline = Pipeline(steps=steps)
X, y = pipeline.fit_resample(X, y)
Splitting the Dataset into Training and Testing¶
# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19, stratify=y)
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)
Multi-Layer Perceptron ( MLP ) Model¶
# Define the base MLP classifier
mlp_clf = MLPClassifier(random_state=19, max_iter=25000)
# Define the parameters grid
mlp_parameters = {
'hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)], # Adjust hidden layer sizes
'activation': ['relu', 'sigmoid', 'logistic'], # Activation functions
'solver': ['adam', 'lbfgs'], # Optimizer
'max_iter': [25000, 25000], # Maximum number of iterations
'learning_rate': ['constant', 'invscaling', 'adaptive'], # Learning rate schedule
'warm_start': [True, False] # Warm start
}
Training the model¶
from sklearn.exceptions import ConvergenceWarning
# Create the GridSearchCV object
warnings.filterwarnings("ignore", category=ConvergenceWarning)
model_MLP = GridSearchCV(mlp_clf, param_grid = mlp_parameters, cv=5, n_jobs=-1)
# Fit the model on the training data
model_MLP.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=MLPClassifier(max_iter=25000, random_state=19),
n_jobs=-1,
param_grid={'activation': ['relu', 'sigmoid', 'logistic'],
'hidden_layer_sizes': [(100,), (50, 50),
(50, 100, 50)],
'learning_rate': ['constant', 'invscaling',
'adaptive'],
'max_iter': [25000, 25000],
'solver': ['adam', 'lbfgs'],
'warm_start': [True, False]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5, estimator=MLPClassifier(max_iter=25000, random_state=19),
n_jobs=-1,
param_grid={'activation': ['relu', 'sigmoid', 'logistic'],
'hidden_layer_sizes': [(100,), (50, 50),
(50, 100, 50)],
'learning_rate': ['constant', 'invscaling',
'adaptive'],
'max_iter': [25000, 25000],
'solver': ['adam', 'lbfgs'],
'warm_start': [True, False]})MLPClassifier(max_iter=25000, random_state=19)
MLPClassifier(max_iter=25000, random_state=19)
Evaluating Performance of the Best Model¶
print("Best Score: " + str(model_MLP.best_score_) + '\n')
print("Best params: " + str(model_MLP.best_params_)+ '\n')
Best Score: 0.808767103187704
Best params: {'activation': 'relu', 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant', 'max_iter': 25000, 'solver': 'adam', 'warm_start': True}
Calculating F1 Score on Test Dataset¶
from sklearn.metrics import f1_score
# Once you have found the best hyperparameters, retrain the model on the entire training set
best_clf = MLPClassifier(random_state=19, **model_MLP.best_params_)
best_clf.fit(X_train, y_train)
# Evaluate the model on the test set
test_score = best_clf.score(X_test, y_test)
print("Test Score:", test_score)
# Predict on the test set
y_pred = best_clf.predict(X_test)
# Calculate F1-Score
f1 = f1_score(y_test, y_pred)
print("F1-Score on Test Set:", f1)
Test Score: 0.7945205479452054 F1-Score on Test Set: 0.7058823529411764
Implementing PCA on the Scaled Dataset¶
# Assuming 'data' is your preprocessed DataFrame
X_scaled = StandardScaler().fit_transform(X)
# Perform PCA
pca = PCA()
pca.fit(X_scaled)
# Calculate explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
# Determine the optimal number of dimensions
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
optimal_num_dimensions = np.argmax(cumulative_variance_ratio >= 0.95) + 1 # 95% explained variance
print("Optimal number of dimensions:", optimal_num_dimensions)
Optimal number of dimensions: 9
Visualizing the Variance Ratio¶
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, marker='o', linestyle='-')
plt.title('Scree Plot')
plt.xlabel('Number of Components')
plt.ylabel('Explained Variance Ratio')
plt.grid(True)
plt.show()
Saving the Dataset after PCA¶
# Transform the original dataset using the optimal number of dimensions
pca = PCA(n_components=optimal_num_dimensions)
X_pca = pca.fit_transform(X_scaled)
# Create a DataFrame from the transformed data
columns = [f"PC{i}" for i in range(1, optimal_num_dimensions + 1)]
df_pca = pd.DataFrame(data=X_pca, columns=columns)
# Concatenate the transformed DataFrame with the target variable 'y'
df_pca['stroke'] = y.reset_index(drop=True)
# Save the DataFrame to a CSV file
df_pca.to_csv("stroke_pca_dataset.csv", index=False)
print("PCA dataset saved successfully.")
PCA dataset saved successfully.
Loading the PCA Dataset¶
df_pca = pd.read_csv("stroke_pca_dataset.csv")
df_pca
| PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | stroke | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.828851 | -0.539826 | 0.140010 | 0.768875 | -1.469538 | -0.376646 | -0.105234 | 0.669137 | -0.594278 | 0 |
| 1 | 2.647854 | 2.530899 | 1.701178 | -1.180675 | 1.219302 | 2.429631 | -1.368331 | -0.685882 | -0.105560 | 0 |
| 2 | -0.324647 | -0.791949 | 1.281082 | -0.018071 | 0.096322 | -1.478234 | 0.535007 | -0.916096 | 1.234680 | 0 |
| 3 | -1.366253 | -1.159091 | -0.568314 | 0.879653 | 0.236135 | 1.640543 | -0.695416 | -0.497358 | 0.049711 | 0 |
| 4 | -2.765878 | 1.803329 | -0.085645 | 0.705587 | 0.616476 | -0.287631 | -0.057494 | -0.047156 | -0.127514 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1453 | 0.149510 | -1.557712 | -0.181262 | -1.011466 | -0.675328 | -0.041219 | -0.028323 | 0.404460 | 0.140922 | 1 |
| 1454 | 1.446376 | 1.671203 | -0.581717 | -1.330187 | 0.054811 | -1.021143 | 1.565213 | -0.357514 | -1.371487 | 1 |
| 1455 | 2.169568 | 2.991093 | -0.943709 | 0.145095 | -1.159379 | 0.862523 | -0.172687 | -0.836594 | 0.790489 | 1 |
| 1456 | -0.465107 | -1.107611 | -0.273717 | 1.418869 | 0.350243 | 0.868377 | -0.462065 | -0.612504 | 1.087426 | 1 |
| 1457 | -0.260618 | -1.402926 | -0.449053 | 0.687002 | -0.913823 | 0.426920 | -0.858289 | -0.083513 | -0.902061 | 1 |
1458 rows × 10 columns
Implementing MLP on the Dataset with reduced Dimentions¶
X_pca = df_pca.drop('stroke', axis=1)
y_pca = df_pca['stroke']
# Scale the features
scaler = StandardScaler()
X_scaled_pca = scaler.fit_transform(X_pca)
# Define the base MLP classifier
mlp_clf = MLPClassifier(random_state=19, max_iter=25000)
# Define the parameters grid
mlp_parameters = {
'hidden_layer_sizes': [(100,), (50, 50), (50, 100, 50)],
'activation': ['relu', 'sigmoid', 'logistic'],
'solver': ['adam', 'lbfgs'],
'max_iter': [25000, 25000],
'learning_rate': ['constant', 'invscaling', 'adaptive'],
'warm_start': [True, False]
}
MLP_pca = GridSearchCV(mlp_clf, param_grid=mlp_parameters, cv=5, n_jobs=-1)
# Fit the model on the entire dataset
MLP_pca.fit(X_scaled_pca, y_pca)
print("Best Score: " + str(MLP_pca.best_score_) + '\n')
print("Best params: " + str(MLP_pca.best_params_) + '\n')
Best Score: 0.8148966718448432
Best params: {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'max_iter': 25000, 'solver': 'adam', 'warm_start': True}
Evaluating the Model on the Test Dataset¶
from sklearn.metrics import f1_score
best_clf = MLPClassifier(random_state=19, **MLP_pca.best_params_)
best_clf.fit(X_train, y_train)
# Evaluate the model on the test set
test_score = best_clf.score(X_test, y_test)
print("Test Score:", test_score)
# Predict on the test set and calculate F1-Score
y_pred = best_clf.predict(X_test)
f1 = f1_score(y_test, y_pred)
print("F1-Score on Test Set:", f1)
Test Score: 0.8287671232876712 F1-Score on Test Set: 0.7395833333333333
Model Score on Test Set:
- After dimensionality reduction, the model attains an accuracy of around 82.8% on the test set.
F1-Score (Precision and Recall):
- The F1-score, which accounts for both precision and recall, is approximately 0.739 on the test set.
Evaluating Performance Improvement through Dimensionality Reduction¶
After analyzing the results from both Step 4 and Step 6, it's crucial to determine if dimensionality reduction has positively impacted performance.
| Model | Test Score | F1-Score |
|---|---|---|
| Model 1 (Without PCA) | 0.794 | 0.705 |
| Model 2 (With PCA) | 0.828 | 0.739 |
Analysis:
- Test Score: Model 2 demonstrated superior overall accuracy compared to Model 1.
- F1-Score: Model 2 achieved a slightly higher equilibrium between precision and recall compared to Model 1.
Conclusion:
The comparison of Model 1 (without PCA) and Model 2 (with PCA) demonstrates that Model 2 outperformed Model 1 in both overall test score and F1-score. This suggests that incorporating PCA improved the model's accuracy and achieved a better balance between precision and recall.
Additionally, the comparison clearly indicates that dimensionality reduction led to performance enhancement in the model. Both the test scores and F1-scores were notably higher when dimensionality reduction was employed, signifying improved accuracy and a better balance between precision and recall. This underscores the effectiveness of incorporating dimensionality reduction techniques in enhancing model performance.